Proyecto final - NLP

Electiva NLP

Fabian Camilo Cabrera
Alejandro Fandiño
In [1]:
import re
from collections import Counter
import pandas as pd
import numpy as np
import urllib
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import plotly.io as pio
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [2]:
auspol = pd.read_csv("auspol2019.csv")
auspol.head()
Out[2]:
created_at id full_text retweet_count favorite_count user_id user_name user_screen_name user_description user_location user_created_at
0 2019-05-20 09:13:44 1130401208756187136 After the climate election: shellshocked green... 0.0 0.0 9.248486e+07 PIPELINEPETE jocksjig Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland 2009-11-25 09:19:45
1 2019-05-20 09:13:43 1130401205367140357 @narendramodi @smritiirani Coverage of indian ... 0.0 0.0 7.756474e+08 Narinder Parmar nparmar1957 Life coach & trainer, Motivational speaker, Ma... Wollongong, NSW, AUSTRALIA 2012-08-23 10:20:40
2 2019-05-20 09:13:33 1130401162782371841 @workmanalice Do you know if Facebook is relea... 0.0 0.0 5.687300e+04 Peter Wells peterwells Writes for @theage and @smh on technology and ... Melbourne 2006-12-11 07:38:06
3 2019-05-20 09:13:29 1130401143551434753 @vanbadham We all understand we have a compuls... 0.0 0.0 9.081660e+17 The Realist therealist822 Calls it as I see it. Anti PC, SJW and VS. If ... NaN 2017-09-14 03:10:30
4 2019-05-20 09:13:23 1130401118666809345 Shares were mixed in Asia, with India and Aust... 0.0 0.0 5.260074e+08 Inquirer Business InquirerBiz The official Twitter account of the Inquirer G... Philippines 2012-03-16 03:51:59
In [3]:
location = pd.read_csv("location_geocode.csv")
location.head()
Out[3]:
name lat long
0 Brisbane, Queensland -27.469771 153.025124
1 Wollongong, NSW, AUSTRALIA -34.427812 150.893061
2 Melbourne -37.813628 144.963058
3 Philippines 12.879721 121.774017
4 Australia -25.274398 133.775136
In [4]:
data=auspol.merge(location,how='inner',left_on='user_location',right_on='name')
data.head()
Out[4]:
created_at id full_text retweet_count favorite_count user_id user_name user_screen_name user_description user_location user_created_at name lat long
0 2019-05-20 09:13:44 1130401208756187136 After the climate election: shellshocked green... 0.0 0.0 9.248486e+07 PIPELINEPETE jocksjig Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland 2009-11-25 09:19:45 Brisbane, Queensland -27.469771 153.025124
1 2019-05-20 09:12:57 1130401009782673408 It is disappointing that @tanya_plibersek has ... 0.0 0.0 5.100258e+07 Matthew Rimmer DrRimmer Professor of IP & Innovation Law @QUTLaw @QUT_... Brisbane, Queensland 2009-06-26 10:17:54 Brisbane, Queensland -27.469771 153.025124
2 2019-05-20 09:02:04 1130398270813949952 'Vote for the climate': NSW demands environmen... 0.0 0.0 9.248486e+07 PIPELINEPETE jocksjig Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland 2009-11-25 09:19:45 Brisbane, Queensland -27.469771 153.025124
3 2019-05-20 07:25:41 1130374015103537152 This is urgent! “False election claims spark p... 0.0 0.0 3.327641e+08 E Ferri eneferri Writer, researcher, educator in media, adverti... Brisbane, Queensland 2011-07-10 11:20:43 Brisbane, Queensland -27.469771 153.025124
4 2019-05-20 07:12:01 1130370578584461313 This is a great piece, not least because it pl... 0.0 2.0 2.180023e+09 Shahar Hameiri ShaharHameiri Associate Professor @polsisengage @UQ_news. Po... Brisbane, Queensland 2013-11-07 12:39:24 Brisbane, Queensland -27.469771 153.025124
In [5]:
data.isnull().sum()
Out[5]:
created_at             0
id                     0
full_text              0
retweet_count          0
favorite_count         0
user_id                0
user_name              0
user_screen_name       0
user_description    5572
user_location          0
user_created_at        0
name                   0
lat                    0
long                   0
dtype: int64
In [6]:
data=data.dropna()
In [7]:
df = data[['created_at','full_text','retweet_count','favorite_count','user_name','user_description','user_location','lat','long']]
df = df.copy()
df.head()
Out[7]:
created_at full_text retweet_count favorite_count user_name user_description user_location lat long
0 2019-05-20 09:13:44 After the climate election: shellshocked green... 0.0 0.0 PIPELINEPETE Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland -27.469771 153.025124
1 2019-05-20 09:12:57 It is disappointing that @tanya_plibersek has ... 0.0 0.0 Matthew Rimmer Professor of IP & Innovation Law @QUTLaw @QUT_... Brisbane, Queensland -27.469771 153.025124
2 2019-05-20 09:02:04 'Vote for the climate': NSW demands environmen... 0.0 0.0 PIPELINEPETE Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland -27.469771 153.025124
3 2019-05-20 07:25:41 This is urgent! “False election claims spark p... 0.0 0.0 E Ferri Writer, researcher, educator in media, adverti... Brisbane, Queensland -27.469771 153.025124
4 2019-05-20 07:12:01 This is a great piece, not least because it pl... 0.0 2.0 Shahar Hameiri Associate Professor @polsisengage @UQ_news. Po... Brisbane, Queensland -27.469771 153.025124
In [8]:
print('Dimensión de la data:', df.shape)
print('Nombre de las columnas:', '\n' ,df.columns)
Dimensión de la data: (129808, 9)
Nombre de las columnas: 
 Index(['created_at', 'full_text', 'retweet_count', 'favorite_count',
       'user_name', 'user_description', 'user_location', 'lat', 'long'],
      dtype='object')
In [9]:
print('Máximo de retweets:', df.retweet_count.max())
print('Maximo de tweets favoritos:', df.favorite_count.max())
Máximo de retweets: 6622.0
Maximo de tweets favoritos: 15559.0
In [10]:
print('Fecha y hora de creación:', df.created_at.min())
print('Fecha y hora de último registro:', df.created_at.max())
Fecha y hora de creación: 2019-05-10 16:58:15
Fecha y hora de último registro: 2019-05-20 11:40:07

Visualizaciones

Serie de tiempo

In [11]:
df['created_at'] =  pd.to_datetime(df['created_at'])
fecha = df['created_at'].dt.date.value_counts()
fecha = fecha.sort_index()
In [12]:
fecha = fecha.to_frame().reset_index().rename(columns={'index':'date','created_at':'count'})
fecha.head()
Out[12]:
date count
0 2019-05-10 1225
1 2019-05-11 5550
2 2019-05-12 7493
3 2019-05-13 8148
4 2019-05-14 8294
In [13]:
fig=go.Figure(go.Scatter(x=fecha['date'],
                         y=fecha['count'],
                         mode='markers+lines',
                         name="Submissions",
                         marker_color='dodgerblue'),
                         )

fig.update_layout(
    title_text='Tweets por día : ({} - {}) '.format(fecha['date'].sort_values()[0].strftime("%d/%m/%Y"),
                                                       fecha['date'].sort_values().iloc[-1].strftime("%d/%m/%Y")),template="plotly_white",
    title_x=0.5,height=450)

fig.show()

De lo anterior, vemos que durante diez días se recopilaron los tweets entre los dias 10 y 20 de mayo de 2019. Vemos que hay un pico alto en el día 18 de mayo que coincide con la fecha de las elecciones oficiales en Australia.

In [14]:
dia=df.loc[(df['created_at'].dt.day == 18), 'created_at']
horas=dia.dt.hour.value_counts()
horas=horas.sort_index()
horas=horas.to_frame().reset_index().rename(columns={'index':'hora','created_at':'cnt'})
horas.head()
Out[14]:
hora cnt
0 0 1625
1 1 1690
2 2 1537
3 3 1342
4 4 1239
In [15]:
fig = go.Figure(go.Scatter(x=horas['hora'],
                                y=horas['cnt'],
                                mode='markers+lines',
                                marker_color=horas['cnt'])) # hover text goes here

fig.update_layout(
    title_text='Horas del día : ({} - {}) horas '.format(horas['hora'].sort_values()[0],
                                                       horas['hora'].sort_values().iloc[-1]),template="plotly_white",
    title_x=0.5, height=500)

fig.show()

Para el día de las elecciones (18 de mayo) encontramos que entre las 8am hasta las 3pm hubo un flujo mayor de partición en twitter, ya que coincide con la hora de apertura y cierre de las elecciones en Australia.

Número de tweets por usuario

In [17]:
plt.figure(figsize=(14,7))
usuarios = df['user_name'].value_counts()
usuarios = usuarios[:15,]
sns.barplot(usuarios.values, usuarios.index,data=df)
plt.title("Top 10 de los números de tweets por usuario")
plt.xticks(rotation='horizontal')
plt.show()

Hastag

In [18]:
def encontrar_hash(text):
    line=re.findall(r'(?<=#)\w+',text)
    return " ".join(line)

df['hash']=df['full_text'].apply(lambda x:encontrar_hash(x))
In [19]:
hastags=list(df[(df['hash'].notnull())&(df['hash']!="")]['hash'])
hastags = [each_string.lower() for each_string in hastags]
hash_df=dict(Counter(hastags))
top_hash_df=pd.DataFrame(list(hash_df.items()),columns = ['word','count']).sort_values('count',ascending=False)[:10]
top_hash_df
Out[19]:
word count
0 auspol 25888
13 ausvotes 14495
17 auspol ausvotes 6395
91 ausvotes auspol 3003
15 auspol ausvotes2019 2496
2279 ausvotes 7news 1787
198 auspol ausvotes19 1265
26 ausvotes2019 auspol 753
239 auspol ausvotes ausvotes2019 700
704 australiadecides 623
In [20]:
fig = go.Figure(go.Bar(
    x=top_hash_df['word'],y=top_hash_df['count'],
    marker={'color': top_hash_df['count'], 
    'colorscale': 'reds'},  
    text=top_hash_df['count'],
    textposition = "outside",
))
fig.update_layout(title_text='Los 10 hastags más comunes',xaxis_title="Hashtags",
                  yaxis_title="frecuencia",template="plotly_white",height=700,title_x=0.5)
fig.show()

Menciones

In [21]:
def mentions(text):
    line=re.findall(r'(?<=@)\w+',text)
    return " ".join(line)
In [22]:
df['mentions']=df['full_text'].apply(lambda x:mentions(x))

temp=df['mentions'].value_counts()[:][1:11]
temp =temp.to_frame().reset_index().rename(columns={'index':'Mentions','mentions':'count'})

temp.head()
Out[22]:
Mentions count
0 ScottMorrisonMP 1565
1 billshortenmp 957
2 TonyAbbottMHR 715
3 AustralianLabor 697
4 abcnews 677
In [23]:
fig = go.Figure(go.Bar(
    x=temp['Mentions'],y=temp['count'],
    marker={'color': temp['count'], 
    'colorscale': 'greens'},  
    text=temp['count'],
    textposition = "outside",
))
fig.update_layout(title_text='Las 10 menciones más comunes',xaxis_title="Mentions",
                  yaxis_title="frecuencia",template="plotly_white",height=700,title_x=0.5)
fig.show()

Emojis

In [24]:
import regex
import emoji

def get_emojis(text):
    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

df['emojis'] = df['full_text'].apply(lambda text: get_emojis(text))
In [25]:
from collections import Counter

temp_emojis = pd.DataFrame(list(zip(Counter(sum(df.emojis.values,[])).keys(),Counter(sum(df.emojis.values,[])).values())))
temp_emojis.columns = ['emoji', 'cnt']
temp_emojis.sort_values('cnt', ascending=False, inplace=True)
temp_emojis.head()
Out[25]:
emoji cnt
26 😂 2136
10 🇦🇺 1182
27 🤣 903
46 👏 690
23 😡 636
In [26]:
trace = go.Bar(x=temp_emojis.emoji.values[:10],
               y=temp_emojis.cnt.values[:10])

layout = go.Layout(title='Emojis más usados')

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

Palabras más comunes

In [36]:
from nltk.corpus import stopwords
stopwords_sp = stopwords.words('english')

def pre_procesado1(texto):
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    texto = [palabra for palabra in texto.split() if palabra not in stopwords_sp]
    texto = " ".join(texto)
    return texto

df['pp'] = df['clean'].apply(lambda texto: pre_procesado1(texto))
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer().fit(df['pp'])
bag_of_words = vec.transform(df['pp'])
sum_words = bag_of_words.sum(axis=0) 
words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
words_freq
Out[36]:
[('election', 24604),
 ('australia', 23297),
 ('vote', 14587),
 ('labor', 12879),
 ('amp', 10823),
 ('people', 7764),
 ('party', 6538),
 ('morrison', 6447),
 ('like', 6412),
 ('one', 6304),
 ('get', 5920),
 ('australian', 5324),
 ('time', 5254),
 ('change', 5046),
 ('government', 5008),
 ('shorten', 4941),
 ('climate', 4784),
 ('federal', 4639),
 ('win', 4622),
 ('good', 4619),
 ('would', 4436),
 ('day', 4407),
 ('coalition', 4316),
 ('us', 4262),
 ('today', 4251),
 ('bill', 4035),
 ('liberal', 3877),
 ('voting', 3840),
 ('go', 3814),
 ('pm', 3810),
 ('lnp', 3792),
 ('right', 3763),
 ('campaign', 3744),
 ('going', 3659),
 ('news', 3613),
 ('know', 3602),
 ('scott', 3577),
 ('minister', 3562),
 ('think', 3544),
 ('years', 3508),
 ('country', 3404),
 ('see', 3381),
 ('well', 3241),
 ('via', 3228),
 ('need', 3198),
 ('new', 3197),
 ('last', 3194),
 ('bob', 3163),
 ('want', 3148),
 ('prime', 3125),
 ('live', 3080),
 ('political', 3041),
 ('seat', 2957),
 ('politics', 2942),
 ('hawke', 2933),
 ('next', 2920),
 ('make', 2892),
 ('great', 2880),
 ('back', 2802),
 ('voters', 2795),
 ('leader', 2733),
 ('abbott', 2715),
 ('still', 2625),
 ('voted', 2599),
 ('policy', 2578),
 ('democracy', 2566),
 ('australians', 2558),
 ('polls', 2557),
 ('votes', 2510),
 ('could', 2498),
 ('even', 2490),
 ('really', 2485),
 ('let', 2450),
 ('way', 2449),
 ('first', 2434),
 ('many', 2417),
 ('says', 2416),
 ('got', 2380),
 ('night', 2363),
 ('say', 2336),
 ('result', 2334),
 ('much', 2333),
 ('never', 2288),
 ('alp', 2285),
 ('better', 2279),
 ('results', 2256),
 ('media', 2232),
 ('victory', 2228),
 ('tony', 2194),
 ('abc', 2151),
 ('take', 2137),
 ('tonight', 2129),
 ('polling', 2119),
 ('another', 2067),
 ('left', 2057),
 ('may', 2051),
 ('queensland', 2024),
 ('tax', 2009),
 ('saturday', 2001),
 ('parties', 1993),
 ('big', 1975),
 ('please', 1957),
 ('future', 1938),
 ('policies', 1899),
 ('lost', 1853),
 ('hope', 1837),
 ('also', 1811),
 ('palmer', 1809),
 ('world', 1805),
 ('every', 1801),
 ('look', 1801),
 ('coverage', 1782),
 ('said', 1779),
 ('done', 1752),
 ('seats', 1747),
 ('best', 1738),
 ('work', 1714),
 ('two', 1699),
 ('greens', 1696),
 ('senate', 1687),
 ('public', 1676),
 ('put', 1614),
 ('sausage', 1600),
 ('liberals', 1588),
 ('come', 1570),
 ('clive', 1569),
 ('power', 1537),
 ('thing', 1527),
 ('anyone', 1506),
 ('wrong', 1500),
 ('tomorrow', 1493),
 ('home', 1487),
 ('nation', 1487),
 ('fuck', 1477),
 ('candidate', 1473),
 ('support', 1464),
 ('real', 1459),
 ('end', 1450),
 ('money', 1447),
 ('yet', 1442),
 ('must', 1428),
 ('man', 1415),
 ('nothing', 1395),
 ('poll', 1374),
 ('national', 1369),
 ('long', 1359),
 ('ever', 1358),
 ('everyone', 1350),
 ('stop', 1339),
 ('week', 1339),
 ('conservative', 1336),
 ('love', 1336),
 ('women', 1333),
 ('always', 1317),
 ('give', 1315),
 ('keep', 1312),
 ('sure', 1299),
 ('made', 1287),
 ('far', 1287),
 ('year', 1258),
 ('economy', 1257),
 ('morning', 1245),
 ('help', 1242),
 ('days', 1239),
 ('govt', 1234),
 ('state', 1232),
 ('needs', 1222),
 ('believe', 1220),
 ('qld', 1217),
 ('leadership', 1210),
 ('thanks', 1209),
 ('parliament', 1206),
 ('majority', 1204),
 ('looking', 1197),
 ('early', 1197),
 ('dutton', 1188),
 ('getting', 1187),
 ('oh', 1179),
 ('looks', 1176),
 ('libs', 1174),
 ('actually', 1171),
 ('former', 1169),
 ('action', 1166),
 ('life', 1164),
 ('hard', 1162),
 ('enough', 1160),
 ('thought', 1156),
 ('electorate', 1155),
 ('might', 1146),
 ('happy', 1135),
 ('yes', 1127),
 ('something', 1120),
 ('lies', 1115),
 ('watch', 1111),
 ('old', 1110),
 ('line', 1106),
 ('around', 1103),
 ('lot', 1098),
 ('care', 1093),
 ('read', 1089),
 ('candidates', 1082),
 ('coal', 1082),
 ('things', 1071),
 ('call', 1068),
 ('scomo', 1062),
 ('trump', 1060),
 ('major', 1059),
 ('someone', 1047),
 ('already', 1043),
 ('remember', 1033),
 ('least', 1012),
 ('pay', 1012),
 ('bad', 1011),
 ('feel', 1009),
 ('called', 1006),
 ('job', 1006),
 ('murdoch', 1004),
 ('speech', 1003),
 ('shit', 997),
 ('since', 995),
 ('green', 994),
 ('wins', 992),
 ('find', 991),
 ('saying', 985),
 ('run', 984),
 ('warringah', 980),
 ('million', 974),
 ('elections', 972),
 ('away', 972),
 ('thank', 962),
 ('house', 957),
 ('show', 933),
 ('elected', 930),
 ('tell', 927),
 ('seems', 927),
 ('maybe', 927),
 ('auspol', 926),
 ('local', 913),
 ('important', 907),
 ('school', 906),
 ('lose', 905),
 ('question', 902),
 ('little', 902),
 ('politicians', 892),
 ('pre', 891),
 ('fucking', 891),
 ('opposition', 891),
 ('hey', 888),
 ('three', 882),
 ('anything', 879),
 ('economic', 878),
 ('hell', 876),
 ('top', 876),
 ('twitter', 874),
 ('jobs', 873),
 ('place', 871),
 ('booth', 870),
 ('without', 869),
 ('final', 868),
 ('gone', 867),
 ('full', 865),
 ('mean', 862),
 ('free', 858),
 ('watching', 852),
 ('ahead', 851),
 ('health', 837),
 ('less', 836),
 ('part', 832),
 ('bit', 824),
 ('cuts', 818),
 ('running', 813),
 ('close', 813),
 ('plan', 811),
 ('family', 806),
 ('start', 804),
 ('seen', 801),
 ('god', 794),
 ('miracle', 793),
 ('times', 788),
 ('coming', 777),
 ('social', 775),
 ('independent', 775),
 ('working', 774),
 ('trying', 773),
 ('issue', 769),
 ('howard', 768),
 ('environment', 764),
 ('nsw', 762),
 ('kids', 757),
 ('team', 756),
 ('sydney', 756),
 ('issues', 754),
 ('else', 752),
 ('true', 752),
 ('gets', 751),
 ('young', 748),
 ('ballot', 747),
 ('business', 742),
 ('shock', 742),
 ('system', 740),
 ('sad', 740),
 ('streaming', 739),
 ('fact', 737),
 ('given', 730),
 ('point', 729),
 ('story', 728),
 ('strong', 727),
 ('defeat', 727),
 ('making', 726),
 ('use', 725),
 ('happened', 723),
 ('chance', 722),
 ('self', 722),
 ('post', 719),
 ('john', 719),
 ('wait', 713),
 ('community', 709),
 ('fair', 705),
 ('set', 705),
 ('children', 704),
 ('likely', 701),
 ('hear', 698),
 ('mp', 696),
 ('poor', 695),
 ('peter', 694),
 ('racist', 691),
 ('person', 691),
 ('stand', 691),
 ('promise', 688),
 ('deal', 688),
 ('wonder', 687),
 ('tv', 687),
 ('fear', 687),
 ('become', 685),
 ('workers', 682),
 ('ask', 680),
 ('forward', 679),
 ('pretty', 679),
 ('past', 677),
 ('leaders', 675),
 ('number', 673),
 ('high', 672),
 ('surprise', 669),
 ('across', 664),
 ('lead', 664),
 ('everything', 663),
 ('makes', 662),
 ('told', 662),
 ('swing', 661),
 ('means', 656),
 ('opinion', 655),
 ('moment', 651),
 ('funding', 650),
 ('probably', 647),
 ('interesting', 646),
 ('goes', 645),
 ('bring', 645),
 ('shows', 644),
 ('negative', 644),
 ('talk', 640),
 ('talking', 638),
 ('death', 638),
 ('weekend', 637),
 ('head', 633),
 ('rest', 632),
 ('melbourne', 632),
 ('hate', 630),
 ('small', 629),
 ('ago', 628),
 ('barnaby', 628),
 ('cut', 627),
 ('wing', 625),
 ('count', 623),
 ('guess', 622),
 ('launch', 622),
 ('went', 619),
 ('wants', 617),
 ('rather', 615),
 ('though', 614),
 ('term', 613),
 ('either', 613),
 ('winning', 611),
 ('lib', 611),
 ('choice', 610),
 ('anning', 607),
 ('primary', 605),
 ('history', 605),
 ('conservatives', 602),
 ('reason', 601),
 ('ok', 600),
 ('idea', 599),
 ('brexit', 599),
 ('planet', 595),
 ('class', 593),
 ('forget', 592),
 ('franking', 592),
 ('instead', 591),
 ('turnbull', 589),
 ('matter', 587),
 ('understand', 587),
 ('latest', 587),
 ('low', 587),
 ('seriously', 586),
 ('open', 585),
 ('huge', 583),
 ('keating', 583),
 ('truth', 582),
 ('current', 580),
 ('united', 574),
 ('white', 574),
 ('almost', 573),
 ('course', 572),
 ('thinking', 572),
 ('claims', 571),
 ('questions', 570),
 ('loss', 569),
 ('check', 569),
 ('used', 568),
 ('different', 568),
 ('others', 567),
 ('centre', 562),
 ('taking', 561),
 ('sausages', 559),
 ('adani', 559),
 ('anti', 555),
 ('feeling', 553),
 ('zali', 553),
 ('asked', 553),
 ('side', 551),
 ('calling', 549),
 ('general', 548),
 ('gt', 548),
 ('continue', 546),
 ('th', 546),
 ('clear', 546),
 ('front', 545),
 ('sorry', 544),
 ('counted', 544),
 ('aussie', 544),
 ('cost', 544),
 ('move', 543),
 ('living', 543),
 ('decision', 541),
 ('hours', 540),
 ('answer', 539),
 ('supporters', 539),
 ('face', 538),
 ('credits', 537),
 ('outcome', 536),
 ('able', 536),
 ('lie', 536),
 ('paper', 533),
 ('trust', 530),
 ('yesterday', 530),
 ('message', 529),
 ('agree', 529),
 ('case', 528),
 ('fight', 528),
 ('save', 527),
 ('stupid', 527),
 ('form', 527),
 ('behind', 526),
 ('analysis', 525),
 ('lying', 523),
 ('happen', 521),
 ('single', 521),
 ('cannot', 520),
 ('numbers', 520),
 ('try', 519),
 ('energy', 518),
 ('water', 518),
 ('act', 517),
 ('heard', 517),
 ('wa', 517),
 ('labour', 517),
 ('stunning', 516),
 ('hold', 515),
 ('problem', 514),
 ('despite', 510),
 ('per', 510),
 ('ad', 509),
 ('member', 509),
 ('ads', 509),
 ('office', 504),
 ('agenda', 504),
 ('tanya', 503),
 ('absolutely', 503),
 ('join', 500),
 ('wow', 496),
 ('follow', 495),
 ('paid', 495),
 ('listen', 495),
 ('words', 495),
 ('press', 494),
 ('mr', 493),
 ('whole', 493),
 ('education', 489),
 ('respect', 488),
 ('losing', 487),
 ('victoria', 487),
 ('interest', 485),
 ('heart', 484),
 ('bloody', 483),
 ('rich', 480),
 ('war', 479),
 ('comes', 479),
 ('schools', 477),
 ('key', 475),
 ('etc', 474),
 ('uap', 474),
 ('progressive', 470),
 ('failed', 470),
 ('vision', 467),
 ('housing', 467),
 ('loses', 467),
 ('taxes', 465),
 ('mate', 464),
 ('nice', 463),
 ('turn', 462),
 ('ready', 462),
 ('game', 461),
 ('attack', 461),
 ('clearly', 461),
 ('fraser', 459),
 ('wake', 458),
 ('quite', 457),
 ('name', 456),
 ('yeah', 456),
 ('dear', 456),
 ('plibersek', 455),
 ('wish', 455),
 ('based', 455),
 ('breaking', 455),
 ('buy', 453),
 ('upset', 452),
 ('wanted', 451),
 ('giving', 450),
 ('retain', 449),
 ('took', 448),
 ('global', 446),
 ('human', 446),
 ('together', 446),
 ('worth', 445),
 ('industry', 442),
 ('advertising', 441),
 ('exactly', 440),
 ('hand', 440),
 ('proud', 440),
 ('came', 440),
 ('campaigning', 439),
 ('guy', 438),
 ('paul', 438),
 ('booths', 437),
 ('imagine', 437),
 ('debt', 435),
 ('mind', 432),
 ('friends', 431),
 ('hanson', 430),
 ('half', 429),
 ('south', 429),
 ('kind', 429),
 ('finally', 428),
 ('budget', 428),
 ('voter', 427),
 ('deserve', 427),
 ('record', 427),
 ('promises', 426),
 ('bbc', 426),
 ('waiting', 424),
 ('mine', 424),
 ('whether', 424),
 ('politician', 423),
 ('safe', 422),
 ('tweet', 421),
 ('disappointed', 421),
 ('blame', 421),
 ('biggest', 421),
 ('uk', 421),
 ('claim', 421),
 ('word', 420),
 ('volunteers', 419),
 ('soon', 419),
 ('men', 418),
 ('luck', 418),
 ('antony', 418),
 ('shame', 417),
 ('using', 417),
 ('worst', 417),
 ('share', 416),
 ('aus', 416),
 ('joyce', 415),
 ('chaos', 413),
 ('seeing', 412),
 ('rights', 412),
 ('sense', 407),
 ('saw', 407),
 ('preferences', 406),
 ('lol', 404),
 ('amazing', 404),
 ('wages', 404),
 ('six', 404),
 ('steggall', 404),
 ('vale', 404),
 ('leave', 403),
 ('scare', 402),
 ('short', 401),
 ('afford', 401),
 ('worse', 401),
 ('dead', 400),
 ('difference', 400),
 ('market', 399),
 ('apparently', 398),
 ('taken', 398),
 ('especially', 397),
 ('electoral', 396),
 ('hit', 396),
 ('minute', 396),
 ('stuff', 394),
 ('nationals', 394),
 ('sick', 393),
 ('cards', 392),
 ('knew', 391),
 ('article', 390),
 ('needed', 389),
 ('cast', 389),
 ('laws', 388),
 ('piece', 388),
 ('billion', 388),
 ('held', 388),
 ('congratulations', 387),
 ('age', 387),
 ('guardian', 387),
 ('dickson', 387),
 ('rip', 387),
 ('expect', 386),
 ('months', 386),
 ('woman', 385),
 ('happening', 384),
 ('hoping', 383),
 ('knows', 383),
 ('spending', 380),
 ('stay', 380),
 ('play', 379),
 ('chinese', 379),
 ('telling', 379),
 ('happens', 378),
 ('ausvotes', 377),
 ('report', 376),
 ('albo', 374),
 ('aec', 374),
 ('spent', 374),
 ('second', 373),
 ('st', 372),
 ('union', 372),
 ('bye', 372),
 ('massive', 371),
 ('service', 371),
 ('society', 371),
 ('views', 370),
 ('refugees', 370),
 ('seem', 369),
 ('daily', 368),
 ('fucked', 368),
 ('ha', 367),
 ('gearing', 366),
 ('perhaps', 365),
 ('guys', 365),
 ('doubt', 363),
 ('including', 363),
 ('found', 363),
 ('corruption', 362),
 ('wtf', 362),
 ('enjoy', 361),
 ('unexpected', 360),
 ('late', 360),
 ('speaking', 359),
 ('gonna', 357),
 ('longer', 355),
 ('complete', 355),
 ('exit', 354),
 ('compulsory', 354),
 ('china', 354),
 ('statement', 354),
 ('return', 354),
 ('impact', 354),
 ('list', 353),
 ('asking', 353),
 ('truly', 353),
 ('canberra', 353),
 ('minority', 352),
 ('penny', 352),
 ('income', 351),
 ('odds', 351),
 ('dog', 350),
 ('role', 349),
 ('aussies', 348),
 ('pollsters', 348),
 ('democratic', 347),
 ('explain', 347),
 ('positive', 346),
 ('fake', 346),
 ('preference', 346),
 ('oz', 346),
 ('bullshit', 346),
 ('lower', 346),
 ('town', 346),
 ('takes', 345),
 ('gave', 345),
 ('propaganda', 345),
 ('serious', 345),
 ('deliver', 344),
 ('reality', 343),
 ('room', 342),
 ('thinks', 342),
 ('counting', 341),
 ('paying', 341),
 ('lives', 341),
 ('religious', 341),
 ('possible', 340),
 ('weeks', 340),
 ('lack', 339),
 ('race', 338),
 ('non', 338),
 ('calls', 338),
 ('pauline', 337),
 ('along', 336),
 ('according', 336),
 ('ppl', 336),
 ('within', 335),
 ('whatever', 335),
 ('prices', 335),
 ('completely', 335),
 ('sky', 335),
 ('members', 334),
 ('towards', 334),
 ('due', 334),
 ('surely', 334),
 ('middle', 333),
 ('thread', 333),
 ('sign', 333),
 ('price', 332),
 ('decided', 331),
 ('mining', 329),
 ('elect', 329),
 ('broadcasting', 329),
 ('growth', 328),
 ('gov', 328),
 ('corrupt', 327),
 ('nyt', 327),
 ('allowed', 324),
 ('spend', 324),
 ('changes', 324),
 ('following', 324),
 ('group', 323),
 ('comments', 323),
 ('decide', 323),
 ('account', 323),
 ('wong', 323),
 ('dark', 322),
 ('putting', 321),
 ('video', 321),
 ('marginal', 320),
 ('pick', 320),
 ('foreign', 319),
 ('speak', 319),
 ('carbon', 319),
 ('choose', 318),
 ('earth', 318),
 ('opportunity', 317),
 ('credit', 317),
 ('christian', 316),
 ('actual', 316),
 ('values', 316),
 ('accused', 316),
 ('hopefully', 315),
 ('greatest', 315),
 ('waste', 315),
 ('rise', 315),
 ('literally', 315),
 ('started', 315),
 ('millions', 314),
 ('expected', 314),
 ('increase', 314),
 ('handing', 313),
 ('wage', 313),
 ('learn', 312),
 ('banks', 312),
 ('services', 312),
 ('wentworth', 311),
 ('voice', 311),
 ('position', 310),
 ('law', 310),
 ('surprised', 310),
 ('private', 310),
 ('corporation', 310),
 ('senator', 309),
 ('view', 309),
 ('states', 308),
 ('send', 308),
 ('malcolm', 308),
 ('outside', 308),
 ('turned', 307),
 ('totally', 306),
 ('facebook', 306),
 ('wife', 305),
 ('worked', 305),
 ('rate', 305),
 ('card', 304),
 ('western', 304),
 ('plans', 303),
 ('honest', 303),
 ('currently', 303),
 ('interests', 303),
 ('west', 302),
 ('science', 302),
 ('bet', 301),
 ('easy', 301),
 ('campaigns', 301),
 ('dont', 301),
 ('police', 301),
 ('scottmorrisonmp', 301),
 ('compared', 300),
 ('seizes', 300),
 ('countries', 299),
 ('sizzle', 299),
 ('page', 299),
 ('rates', 299),
 ('faith', 299),
 ('higher', 299),
 ('desperate', 299),
 ('unlosable', 298),
 ('zero', 298),
 ('sunday', 297),
 ('security', 297),
 ('standing', 297),
 ('crap', 297),
 ('personal', 296),
 ('independents', 296),
 ('except', 296),
 ('reminder', 296),
 ('toxic', 296),
 ('simply', 296),
 ('ideas', 296),
 ('aged', 296),
 ('cent', 296),
 ('anyway', 296),
 ('fund', 295),
 ('company', 295),
 ('bowen', 294),
 ('changed', 294),
 ('hands', 294),
 ('definitely', 293),
 ('medicare', 292),
 ('led', 292),
 ('level', 291),
 ('welfare', 291),
 ('appears', 290),
 ('regional', 290),
 ('tells', 290),
 ('church', 290),
 ('focus', 290),
 ('moving', 289),
 ('ffs', 289),
 ('gay', 289),
 ('terrible', 289),
 ('decides', 288),
 ('supporting', 288),
 ('chris', 287),
 ('mother', 285),
 ('glad', 285),
 ('showing', 285),
 ('ones', 285),
 ('surplus', 285),
 ('commission', 284),
 ('note', 284),
 ('billshortenmp', 283),
 ('certainly', 282),
 ('vic', 282),
 ('simple', 282),
 ('supposed', 282),
 ('control', 281),
 ('dirty', 281),
 ('north', 281),
 ('research', 281),
 ('yep', 280),
 ('crisis', 280),
 ('nz', 280),
 ('track', 279),
 ('racism', 278),
 ('mum', 278),
 ('die', 278),
 ('risk', 278),
 ('red', 278),
 ('later', 278),
 ('preferred', 277),
 ('environmental', 276),
 ('contest', 276),
 ('reform', 276),
 ('reports', 275),
 ('deputy', 274),
 ('challenge', 273),
 ('reef', 273),
 ('welcome', 273),
 ('build', 272),
 ('religion', 272),
 ('club', 272),
 ('push', 271),
 ('fine', 271),
 ('consider', 271),
 ('realise', 271),
 ('four', 270),
 ('facts', 270),
 ('indigenous', 270),
 ('emergency', 270),
 ('freedom', 270),
 ('murderers', 270),
 ('dumb', 269),
 ('minutes', 269),
 ('battle', 268),
 ('cash', 268),
 ('whitlam', 268),
 ('stage', 267),
 ('ministers', 266),
 ('fall', 265),
 ('tough', 265),
 ('evidence', 265),
 ('winner', 264),
 ('data', 264),
 ('york', 264),
 ('buyers', 264),
 ('sounds', 263),
 ('brilliant', 263),
 ('generation', 263),
 ('investment', 263),
 ('influence', 263),
 ('wealthy', 262),
 ('common', 262),
 ('ya', 262),
 ('listening', 262),
 ('interview', 262),
 ('works', 261),
 ('vs', 261),
 ('legacy', 261),
 ('absolute', 261),
 ('avoid', 261),
 ('blue', 260),
 ('guide', 259),
 ('hour', 259),
 ('correct', 259),
 ('lots', 258),
 ('anymore', 258),
 ('funded', 258),
 ('michael', 258),
 ('sort', 257),
 ('folks', 257),
 ('plus', 256),
 ('bloke', 256),
 ('promised', 256),
 ('five', 256),
 ('reading', 256),
 ('jones', 256),
 ('yrs', 255),
 ('order', 255),
 ('matters', 255),
 ('excellent', 255),
 ('female', 254),
 ('brought', 254),
 ('beer', 254),
 ('beyond', 253),
 ('fun', 253),
 ('zealand', 253),
 ('predictions', 253),
 ('air', 253),
 ('poverty', 253),
 ('un', 252),
 ('volunteer', 252),
 ('debate', 252),
 ('deposit', 252),
 ('couple', 251),
 ('leading', 251),
 ('however', 251),
 ('delivered', 251),
 ('beautiful', 251),
 ('emissions', 251),
 ('bed', 250),
 ('mark', 250),
 ('nobody', 249),
 ('co', 249),
 ('total', 249),
 ('wonderful', 249),
 ('josh', 249),
 ('straight', 249),
 ('eurovision', 249),
 ('decent', 248),
 ('fighting', 248),
 ('greed', 248),
 ('car', 248),
 ('miss', 248),
 ('box', 248),
 ('bs', 248),
 ...]

Nube de palabras

In [24]:
import re

pattern1 = '?P<pic>pic.twitter.com/[^\s]+'
pattern2 = '?P<url>https?://[^\s]+'

def text_clean(row):
    text = row['full_text']
    
    links = [tuple(j for j in i if j)[-1] for i in re.findall(f"({pattern1})|({pattern2})",text)]
    for link in links:
        text = text.replace(link,"")
             
    hashtags = [interaction for interaction in text.split() if interaction.startswith("#")]
    for hashtag in hashtags:
        text = text.replace(hashtag,"")
        
    mentions = [interaction for interaction in text.split() if interaction.startswith("@")]
    for mention in mentions:
        text = text.replace(mention,"")
        
    return text, links, hashtags, mentions
In [27]:
df[['clean', 'links', 'hashtags', 'mentions']] = df.apply(text_clean, axis=1, result_type='expand')
In [29]:
import PIL.Image
australia = PIL.Image.open('australiaMapa.png')
#display(australia)
In [30]:
mask = np.array(australia)
In [31]:
palabras = df.clean.str.cat(sep=" ")
wordcloud = WordCloud(width=800, height=400,
                     max_font_size=150, max_words=250,
                     background_color='white', colormap='copper_r',
                     stopwords=stopwords_sp,
                     mask=mask, contour_width=0.5, contour_color='green').generate(palabras) # https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html

#wordcloud.to_file("australia.jpg")

plt.figure(figsize=(10,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

Se puede ver que hay al menos dos partidos en Australia: "Laborista" y "Liberal". Uno de estos dos partidos está dirigido por Scott Morrison, otro puede estar dirigido por Bill Shorten.

Mapa de calor

In [32]:
import re
from nltk.corpus import stopwords
stopwords_sp = stopwords.words('english')
from sklearn.feature_extraction.text import TfidfVectorizer

def pre_procesado(texto):
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    texto = " ".join([palabra for palabra in texto.split() if palabra not in stopwords_sp])
    return texto

tfidf_vect = TfidfVectorizer(preprocessor=pre_procesado, ngram_range=(1,1), max_features=100)
tfidf = tfidf_vect.fit_transform(df.clean.values)
In [33]:
from sklearn.metrics.pairwise import cosine_similarity

voc = [k for k,v in sorted(tfidf_vect.vocabulary_.items(), key=lambda kv: kv[1])]
temp = pd.DataFrame(tfidf.toarray())
temp = cosine_similarity(temp.T.values)
temp = pd.DataFrame(temp)
temp.columns = voc
temp.index = voc
temp
Out[33]:
abbott abc alp amp another australia australian australians back better ... voted voters votes voting want way well win would years
abbott 1.000000 0.014262 0.005224 0.018468 0.009986 0.015163 0.012217 0.005098 0.017143 0.012778 ... 0.009632 0.009098 0.009321 0.002495 0.008768 0.010441 0.016119 0.020378 0.015254 0.015729
abc 0.014262 1.000000 0.028927 0.023630 0.008148 0.026225 0.038204 0.008369 0.014988 0.012292 ... 0.006596 0.009708 0.051690 0.004497 0.011536 0.008335 0.013630 0.012339 0.011380 0.012819
alp 0.005224 0.028927 1.000000 0.030668 0.009421 0.012035 0.009743 0.006152 0.012354 0.011713 ... 0.014046 0.019797 0.036109 0.008500 0.011176 0.013522 0.012004 0.034327 0.022604 0.011086
amp 0.018468 0.023630 0.030668 1.000000 0.026982 0.058911 0.033266 0.031887 0.030980 0.032452 ... 0.026599 0.035142 0.018154 0.028521 0.038388 0.032221 0.032352 0.026729 0.039332 0.034822
another 0.009986 0.008148 0.009421 0.026982 1.000000 0.029417 0.016131 0.008915 0.018088 0.008201 ... 0.009965 0.010575 0.006368 0.010029 0.021820 0.019006 0.021979 0.017720 0.014866 0.111310
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
way 0.010441 0.008335 0.013522 0.032221 0.019006 0.036827 0.019373 0.015824 0.023787 0.016446 ... 0.019107 0.026359 0.013544 0.028104 0.022391 1.000000 0.017655 0.015544 0.024623 0.016263
well 0.016119 0.013630 0.012004 0.032352 0.021979 0.042637 0.012236 0.013439 0.018066 0.016827 ... 0.016761 0.010758 0.010030 0.016015 0.016450 0.017655 1.000000 0.016332 0.026682 0.022527
win 0.020378 0.012339 0.034327 0.026729 0.017720 0.093731 0.058644 0.013323 0.018839 0.013702 ... 0.014302 0.022446 0.026097 0.008636 0.020506 0.015544 0.016332 1.000000 0.035417 0.016776
would 0.015254 0.011380 0.022604 0.039332 0.014866 0.040433 0.020460 0.020724 0.019567 0.032453 ... 0.020552 0.019741 0.010836 0.022163 0.026158 0.024623 0.026682 0.035417 1.000000 0.024329
years 0.015729 0.012819 0.011086 0.034822 0.111310 0.043926 0.026638 0.016152 0.022451 0.013679 ... 0.019406 0.022161 0.005616 0.014487 0.024205 0.016263 0.022527 0.016776 0.024329 1.000000

100 rows × 100 columns

In [34]:
import scipy.cluster.hierarchy as sch
import numpy as np

pairwise_distances = sch.distance.pdist(temp)
linkage = sch.linkage(pairwise_distances, method='ward')
idx_to_cluster_array = sch.fcluster(linkage, pairwise_distances.max() * 1, criterion='distance')
idx = np.argsort(idx_to_cluster_array)
temp = temp.copy()
    
temp2 = temp.iloc[idx, :].T.iloc[idx, :]
my_idx = idx_to_cluster_array
In [35]:
trace = go.Heatmap(z=temp2.values.tolist(),
                   x=temp2.index.values,
                   y=temp2.columns.values,
                   colorscale='Oranges')

layout = go.Layout(title='🔥 Mapa de calor entre palabras (ordenado)🔥',
                   width=800, height=800)

fig = go.Figure(data=[trace],layout=layout)
iplot(fig)

Ubicación de los usuarios

In [47]:
localizacion = df[['lat','long']]
localizacion.columns = localizacion.columns.str.strip()
In [48]:
import folium
from folium import plugins
from folium.plugins import HeatMap
In [49]:
mapa = folium.Map(location=[-25.274398,133.775136],zoom_start = 5)
In [50]:
heat_df = localizacion[['lat', 'long']]
heat_df = heat_df.dropna(axis=0, subset=['lat','long'])

# List comprehension para hacer una lista de listas
heat_data = [[row['lat'],row['long']] for index, row in heat_df.iterrows()]

HeatMap(heat_data).add_to(mapa)

mapa
Out[50]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Gráfico de puntos

In [53]:
map_osm = folium.Map(location=[-25.274398,133.775136],zoom_start=15)
In [54]:
subset_of_df = localizacion.sample(n=1000,random_state=1)
In [55]:
some_map = folium.Map(location=[ subset_of_df['lat'].mean(),
                                 subset_of_df['long'].mean()],
                                zoom_start=10)

for row in subset_of_df.itertuples():
    some_map.add_child(folium.Marker(location=[row.lat,row.long]))
In [56]:
some_map
Out[56]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Análisis de sentimiento

In [27]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

df['sentimiento'] = df['full_text'].apply(lambda valor: vader.polarity_scores(valor)['compound'])
df.head()
Out[27]:
created_at full_text retweet_count favorite_count user_name user_description user_location lat long hash mentions emojis sentimiento
0 2019-05-20 09:13:44 After the climate election: shellshocked green... 0.0 0.0 PIPELINEPETE Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland -27.469771 153.025124 [] 0.2732
1 2019-05-20 09:12:57 It is disappointing that @tanya_plibersek has ... 0.0 0.0 Matthew Rimmer Professor of IP & Innovation Law @QUTLaw @QUT_... Brisbane, Queensland -27.469771 153.025124 auspol tanya_plibersek AustralianLabor [] -0.4404
2 2019-05-20 09:02:04 'Vote for the climate': NSW demands environmen... 0.0 0.0 PIPELINEPETE Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland -27.469771 153.025124 [] 0.0000
3 2019-05-20 07:25:41 This is urgent! “False election claims spark p... 0.0 0.0 E Ferri Writer, researcher, educator in media, adverti... Brisbane, Queensland -27.469771 153.025124 [] 0.6476
4 2019-05-20 07:12:01 This is a great piece, not least because it pl... 0.0 2.0 Shahar Hameiri Associate Professor @polsisengage @UQ_news. Po... Brisbane, Queensland -27.469771 153.025124 [] 0.6597
In [28]:
cut = pd.cut(df['sentimiento'],[-np.inf, -.01, .01, np.inf],labels=['negativo', 'neutro', 'positivo'])
df['polaridad'] = cut.values
df[['polaridad','sentimiento']].head()
Out[28]:
polaridad sentimiento
0 positivo 0.2732
1 negativo -0.4404
2 neutro 0.0000
3 positivo 0.6476
4 positivo 0.6597
In [29]:
df[['full_text', 'sentimiento','polaridad']].sort_values('sentimiento')
Out[29]:
full_text sentimiento polaridad
100584 @songfestival @kaneurovision #NORWAY WON THE T... -0.9988 negativo
104385 fuck fuck fuck fuck fuck fuck fuck fuck fuck f... -0.9987 negativo
4980 Lies. Lies. Lies. Lies. Lies. Lies. Lies. Lies... -0.9971 negativo
38666 fuck fuck fuck fuck fuck fuck fuck fuck fuck f... -0.9953 negativo
89413 FACT CHECK on Coalition's adds claiming new ta... -0.9944 negativo
... ... ... ...
82768 @smh 🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣... 0.9991 positivo
11134 😂💧😂💧😂💧😂💧😱💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧😂💧... 0.9992 positivo
126871 #Election2019Results\n#AusVotes2019 \n#auspol ... 0.9993 positivo
5445 Old mate throwing $1M on Labor with @ladbrokes... 0.9998 positivo
48618 I am now going to watch the political coverage... 0.9998 positivo

129808 rows × 3 columns

In [30]:
print('Número de tweets negativos:' ,len(df.sentimiento.loc[df.sentimiento<0]))
Número de tweets negativos: 39692
In [31]:
print('Número de tweets positivos:' ,len(df.sentimiento.loc[df.sentimiento>0]))
Número de tweets positivos: 56277
In [32]:
print('Número de tweets neutros:' ,len(df.sentimiento.loc[df.sentimiento==0]))
Número de tweets neutros: 33839
In [33]:
plt.figure(figsize=(14, 6))
polaridad = df['polaridad'].value_counts()
splot=sns.barplot(x=polaridad.index,y=polaridad.values,data=df)
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.1f'), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
plt.ylabel("Frecuencia", size=14)
Out[33]:
Text(0, 0.5, 'Frecuencia')
In [47]:
labels = df.polaridad.value_counts().index
colors = ['blue','darkorange','yellow']
sizes = df.polaridad.value_counts().values
plt.figure(0,figsize = (7,7))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
plt.title('Sentimientos',color = 'blue',fontsize = 15)
plt.show()

Modelo LDA

In [58]:
import pyLDAvis.gensim
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint
In [59]:
from nltk.corpus import stopwords
stopwords_sp = stopwords.words('english')

def pre_procesado1(texto):
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    texto = [palabra for palabra in texto.split() if palabra not in stopwords_sp]
    return texto
In [60]:
df['pre_procesado'] = df['full_text'].apply(lambda texto: pre_procesado1(texto))
In [61]:
df.head()
Out[61]:
created_at full_text retweet_count favorite_count user_name user_description user_location lat long hash mentions emojis sentimiento polaridad pre_procesado
0 2019-05-20 09:13:44 After the climate election: shellshocked green... 0.0 0.0 PIPELINEPETE Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland -27.469771 153.025124 [] 0.2732 positivo [climate, election, shellshocked, green, group...
1 2019-05-20 09:12:57 It is disappointing that @tanya_plibersek has ... 0.0 0.0 Matthew Rimmer Professor of IP & Innovation Law @QUTLaw @QUT_... Brisbane, Queensland -27.469771 153.025124 auspol tanya_plibersek AustralianLabor [] -0.4404 negativo [disappointing, tanya, plibersek, ruled, austr...
2 2019-05-20 09:02:04 'Vote for the climate': NSW demands environmen... 0.0 0.0 PIPELINEPETE Retired Tradesman and Progressive Anti Conserv... Brisbane, Queensland -27.469771 153.025124 [] 0.0000 neutro [vote, climate, nsw, demands, environmental, s...
3 2019-05-20 07:25:41 This is urgent! “False election claims spark p... 0.0 0.0 E Ferri Writer, researcher, educator in media, adverti... Brisbane, Queensland -27.469771 153.025124 [] 0.6476 positivo [urgent, false, election, claims, spark, push,...
4 2019-05-20 07:12:01 This is a great piece, not least because it pl... 0.0 2.0 Shahar Hameiri Associate Professor @polsisengage @UQ_news. Po... Brisbane, Queensland -27.469771 153.025124 [] 0.6597 positivo [great, piece, least, places, last, elections,...
In [62]:
# Crear una representación de los documentos en forma de diccionario
dictionary = Dictionary(df['pre_procesado'].values)
In [63]:
# Filtrar palabras muy frecuentes o infrecuentes
dictionary.filter_extremes(no_below=5, no_above=0.5)

corpus = [dictionary.doc2bow(text) for text in df['pre_procesado'].values]
In [64]:
# Train the topic model
model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=7, passes=4)
In [45]:
model.print_topics(num_words=10)
Out[45]:
[(0,
  '0.036*"ausvotes" + 0.013*"vote" + 0.013*"people" + 0.012*"like" + 0.010*"get" + 0.009*"change" + 0.009*"climate" + 0.009*"amp" + 0.008*"australia" + 0.008*"labor"'),
 (1,
  '0.035*"ausvotes" + 0.019*"amp" + 0.011*"lnp" + 0.010*"labor" + 0.008*"scottmorrisonmp" + 0.007*"media" + 0.006*"government" + 0.006*"campaign" + 0.006*"liberalaus" + 0.006*"dutton"'),
 (2,
  '0.033*"ausvotes" + 0.010*"amp" + 0.009*"barnaby" + 0.008*"new" + 0.008*"economy" + 0.008*"tax" + 0.007*"joyce" + 0.007*"health" + 0.005*"public" + 0.005*"low"'),
 (3,
  '0.110*"ausvotes" + 0.022*"shorten" + 0.017*"australiadecides" + 0.017*"bill" + 0.015*"labor" + 0.015*"morrison" + 0.014*"australiavotes" + 0.011*"ausvote" + 0.010*"election" + 0.010*"minister"'),
 (4,
  '0.021*"bob" + 0.020*"hawke" + 0.010*"amp" + 0.008*"tanya" + 0.007*"plibersek" + 0.007*"pm" + 0.007*"australian" + 0.006*"bobhawke" + 0.006*"billshortenmp" + 0.006*"australia"'),
 (5,
  '0.092*"election" + 0.060*"australia" + 0.022*"australian" + 0.022*"news" + 0.019*"federal" + 0.016*"win" + 0.015*"live" + 0.014*"results" + 0.010*"politics" + 0.010*"via"'),
 (6,
  '0.064*"ausvotes" + 0.026*"vote" + 0.015*"election" + 0.012*"australia" + 0.011*"voting" + 0.010*"australiavotes" + 0.010*"democracy" + 0.009*"democracysausage" + 0.009*"day" + 0.008*"polling"')]
In [65]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)
Out[65]:

El primer topico vemos que hay una gran cantidad de palabras en la que resalta 'labor' del partido laborista, por otro lado, el topico tres y cuatro tienen palabras comunes y las que se destacan son: 'morrison', 'labor' y 'shorten', esto podria indicar como se dividen los documentos entre los dos candidatos y tambien vemos la palabra 'win' dando una tendencia de ganador a Morrison.

In [66]:
d = dictionary.doc2bow(["shorten", "morrinson"])
topics = model.get_document_topics(d)
topics
Out[66]:
[(0, 0.071429186),
 (1, 0.071429186),
 (2, 0.071429186),
 (3, 0.071429186),
 (4, 0.071551375),
 (5, 0.57130265),
 (6, 0.071429186)]
In [59]:
def get_doc_top_n(text_processed, n):
    d = dictionary.doc2bow(text_processed)
    topics = dict(model.get_document_topics(d))
    try:
        return topic[n]
    except:
        return None
    
get_doc_top_n(["election", "shorten", "morrinson", "australia"],1) 
In [60]:
for t in range(0,7):
    top_name = f"topic_{t}"
    df[top_name] = df['pre_procesado'].apply(lambda doc: get_doc_top_n(doc,t))
In [61]:
for t in range(0,7):
    print(f"****************************** TOPIC {t} ******************************")
    for i,row in enumerate(df.sort_values(f"topic_{t}", ascending=False)['full_text'].head()):
        print(f"tweet #{i+1}")
        print(row[:500])
        print()
    print()
****************************** TOPIC 0 ******************************
tweet #1
After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD

tweet #2
It is disappointing that @tanya_plibersek has ruled herself out of the @AustralianLabor leadership challenge - given her qualities https://t.co/n1DaiVRRF7 #auspol

tweet #3
'Vote for the climate': NSW demands environmental short film's removal from internet https://t.co/VrFvB0vyJ4

tweet #4
This is urgent! “False election claims spark push for truth in political advertising laws” https://t.co/iabJ3b8FAO

tweet #5
This is a great piece, not least because it places the last few elections in their context. What we're seeing play out now in Australian politics is the slow and ugly dissolution of politics of the industrial era and its stable 2-party structure https://t.co/TEzbOavWEs


****************************** TOPIC 1 ******************************
tweet #1
After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD

tweet #2
It is disappointing that @tanya_plibersek has ruled herself out of the @AustralianLabor leadership challenge - given her qualities https://t.co/n1DaiVRRF7 #auspol

tweet #3
'Vote for the climate': NSW demands environmental short film's removal from internet https://t.co/VrFvB0vyJ4

tweet #4
This is urgent! “False election claims spark push for truth in political advertising laws” https://t.co/iabJ3b8FAO

tweet #5
This is a great piece, not least because it places the last few elections in their context. What we're seeing play out now in Australian politics is the slow and ugly dissolution of politics of the industrial era and its stable 2-party structure https://t.co/TEzbOavWEs


****************************** TOPIC 2 ******************************
tweet #1
After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD

tweet #2
It is disappointing that @tanya_plibersek has ruled herself out of the @AustralianLabor leadership challenge - given her qualities https://t.co/n1DaiVRRF7 #auspol

tweet #3
'Vote for the climate': NSW demands environmental short film's removal from internet https://t.co/VrFvB0vyJ4

tweet #4
This is urgent! “False election claims spark push for truth in political advertising laws” https://t.co/iabJ3b8FAO

tweet #5
This is a great piece, not least because it places the last few elections in their context. What we're seeing play out now in Australian politics is the slow and ugly dissolution of politics of the industrial era and its stable 2-party structure https://t.co/TEzbOavWEs


****************************** TOPIC 3 ******************************
tweet #1
After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD

tweet #2
It is disappointing that @tanya_plibersek has ruled herself out of the @AustralianLabor leadership challenge - given her qualities https://t.co/n1DaiVRRF7 #auspol

tweet #3
'Vote for the climate': NSW demands environmental short film's removal from internet https://t.co/VrFvB0vyJ4

tweet #4
This is urgent! “False election claims spark push for truth in political advertising laws” https://t.co/iabJ3b8FAO

tweet #5
This is a great piece, not least because it places the last few elections in their context. What we're seeing play out now in Australian politics is the slow and ugly dissolution of politics of the industrial era and its stable 2-party structure https://t.co/TEzbOavWEs


****************************** TOPIC 4 ******************************
tweet #1
After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD

tweet #2
It is disappointing that @tanya_plibersek has ruled herself out of the @AustralianLabor leadership challenge - given her qualities https://t.co/n1DaiVRRF7 #auspol

tweet #3
'Vote for the climate': NSW demands environmental short film's removal from internet https://t.co/VrFvB0vyJ4

tweet #4
This is urgent! “False election claims spark push for truth in political advertising laws” https://t.co/iabJ3b8FAO

tweet #5
This is a great piece, not least because it places the last few elections in their context. What we're seeing play out now in Australian politics is the slow and ugly dissolution of politics of the industrial era and its stable 2-party structure https://t.co/TEzbOavWEs


****************************** TOPIC 5 ******************************
tweet #1
After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD

tweet #2
It is disappointing that @tanya_plibersek has ruled herself out of the @AustralianLabor leadership challenge - given her qualities https://t.co/n1DaiVRRF7 #auspol

tweet #3
'Vote for the climate': NSW demands environmental short film's removal from internet https://t.co/VrFvB0vyJ4

tweet #4
This is urgent! “False election claims spark push for truth in political advertising laws” https://t.co/iabJ3b8FAO

tweet #5
This is a great piece, not least because it places the last few elections in their context. What we're seeing play out now in Australian politics is the slow and ugly dissolution of politics of the industrial era and its stable 2-party structure https://t.co/TEzbOavWEs


****************************** TOPIC 6 ******************************
tweet #1
After the climate election: shellshocked green groups remain resolute https://t.co/wyJzmAcyiD

tweet #2
It is disappointing that @tanya_plibersek has ruled herself out of the @AustralianLabor leadership challenge - given her qualities https://t.co/n1DaiVRRF7 #auspol

tweet #3
'Vote for the climate': NSW demands environmental short film's removal from internet https://t.co/VrFvB0vyJ4

tweet #4
This is urgent! “False election claims spark push for truth in political advertising laws” https://t.co/iabJ3b8FAO

tweet #5
This is a great piece, not least because it places the last few elections in their context. What we're seeing play out now in Australian politics is the slow and ugly dissolution of politics of the industrial era and its stable 2-party structure https://t.co/TEzbOavWEs


k-means

In [10]:
df_muestra = df.sample(n=500, random_state=1)
In [11]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

def pre_procesado2(texto):
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    return texto

tfidf_vect = TfidfVectorizer(preprocessor=pre_procesado2)
tfidf = tfidf_vect.fit_transform(df_muestra.full_text.values)
tfidf_matrix = pd.DataFrame(tfidf.toarray(), columns=tfidf_vect.get_feature_names())
In [12]:
# Escogiendo el K nodo
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
In [13]:
ks = []
kinertia = []

for k in range(1,10):
    kmeans = KMeans(n_clusters=k).fit(tfidf_matrix)
    ks.append(k)
    kinertia.append(kmeans.inertia_)
    
plt.xlabel("Número de cúmulos"); 
plt.ylabel("Inercia");
plt.title('Método del codo');
plt.plot(ks,kinertia, 'bx-');
In [19]:
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

k_pt = 4
model = KMeans(n_clusters=k_pt)
model.fit(tfidf_matrix)
nbrs = NearestNeighbors(n_neighbors=3, metric='euclidean').fit(tfidf_matrix.values)

df_muestra['cluster'] = model.labels_
clust_cnt = df_muestra['cluster'].value_counts()
clust_cnt_pct = df_muestra['cluster'].value_counts(normalize=True)

print(f"{color.BOLD}CLUSTERS:\n{color.END}")
centroids = model.cluster_centers_
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vect.get_feature_names()

for i in range(k_pt):
    print(f"{color.BLUE}Cluster {i}:{color.END}")
    print(f"{color.CYAN}COUNT {color.END} {clust_cnt[i]} comments ({clust_cnt_pct[i]:.2%} of the data)")
    print(f"{color.CYAN}TERMS {color.END}", end=" ")
    for ind in order_centroids[i, :20]:
        print(f'{color.BOLD}{terms[ind]}{color.END}', end=" "),
    print(f"\n{color.CYAN}REPRESENTATIVE COMMENTS{color.END}")
    for comment in df_muestra.iloc[nbrs.kneighbors([centroids[i]])[1][0]]['full_text'].values:
        print(f"* {comment}")
    print("\n")
CLUSTERS:

Cluster 0:
COUNT  22 comments (4.40% of the data)
TERMS  australiavotes auspol the australiadecides feel alp election senatorwong results fuck ausvotes co https libs defeated going skynewsaust win ill lubiephil 
REPRESENTATIVE COMMENTS
* fuck i love @SenatorWong.

#AustraliaVotes #auspol #ausvotes19
* #auspol #australiadecides #australiavotes #election2019 #skynewsaust #abcnews24 
LIBS WIN KOOYONG
* I feel defeated #australiavotes #ausvotes #auspol


Cluster 1:
COUNT  125 comments (25.00% of the data)
TERMS  to the that auspol this in we is ausvotes and it you he australia of https co his out get 
REPRESENTATIVE COMMENTS
* What an amazing night for our nation! If our opponents have learnt one thing, I hope that it is to NEVER underestimate our Liberal Party, the party that is not for the rich or the poor, but for Australians &amp; the Party which Australians always turn to. Thank you Australia! #AusPol
* One thing we do get right is how easy it is to vote in Australia. On a weekend, I had no line and was done in a few mins. #ausvotes
* TOMORROW WE GET TO KICK THIS MOB OUT YA'LL. Remember 2007 when Howard had to front the nation knowing he had lost his seat? Remember that? Let's do that again tomorrow. #auspol


Cluster 2:
COUNT  205 comments (41.00% of the data)
TERMS  the to and of for auspol is ausvotes in on it vote labor you https co are will australia election 
REPRESENTATIVE COMMENTS
* After all the bullshit from left,right and centre in the #AusVotes2019 it must be clear to people it’s all a farce. This #election is just more proof of the scam that democracy in #Australia is. Doesn’t matter what happens next week. #auspol will remain the #USA’s little bitch.
* Voting in Australia is done so well. It’s on a Saturday and you can vote early if you’re busy. The lines go quickly and there’s almost always a bbq going with a little stall for the school hosting the ballots. In and out in 5 minutes and scored a $2 #democracysausage #AUSVote19
* #BobHawke is dead. RIP. Sit back &amp; wait for @LiberalAus to say it was Labor's fault. That's if the country's meanest bastards can stop celebrating the loss of someone greater than the sum of them all. Vote @AustralianLabor for a decent Australia. #AusVotes19 #auspol #ausvotes


Cluster 3:
COUNT  148 comments (29.60% of the data)
TERMS  co https ausvotes auspol election australia for the to at in is news what shorten win morrison of there scott 
REPRESENTATIVE COMMENTS
* #auspol #AusVotes2019 #ausvotes #AusVotes19 https://t.co/RHyysKch23
* #auspol #ausvotes #ausvotes2019 https://t.co/eaYgtPZyyI
* #auspol #AusVotes2019 https://t.co/2n2fnvYkEg


In [20]:
clusters = {0:'1',
            1:'2',
            2:'3',
            3:'4'
           }

df_muestra['cluster'] = df_muestra['cluster'].apply(lambda val: clusters[val])
df_muestra.sample(2)
Out[20]:
created_at full_text retweet_count favorite_count user_name user_description user_location lat long cluster
131067 2019-05-11 08:57:22 #AustraliaDecides is being repeated tonight 8.... 0.0 8.0 Spencer Howson 🍷🐈📻 Industry Fellow and MA student @USQedu, Adviso... Springfield / Brisbane -27.65000 152.900000 3
35626 2019-05-18 10:03:12 Deputy PM Michael McCormack on Tony Abbott's l... 1.0 6.0 sam langford news & politics reporter @junkee, used to edit... Sydney, Australia -33.86882 151.209295 3
In [21]:
df_centroides = pd.DataFrame(model.cluster_centers_)
df_centroides['cluster'] = clusters.values()
df_centroides
Out[21]:
0 1 2 3 4 5 6 7 8 9 ... 3574 3575 3576 3577 3578 3579 3580 3581 3582 cluster
0 0.000000e+00 -1.084202e-19 0.000000 8.673617e-19 0.000000 -1.084202e-19 0.026904 0.000000 0.000000e+00 -1.084202e-19 ... 0.000000e+00 1.084202e-19 5.421011e-20 -1.084202e-19 0.000000e+00 -1.084202e-19 2.168404e-19 -5.421011e-20 -1.084202e-19 1
1 1.576076e-03 1.781892e-03 0.004841 3.825270e-03 0.002152 -1.084202e-19 0.002666 0.002152 1.535488e-03 1.384705e-03 ... -5.421011e-19 7.589415e-19 2.168404e-19 1.084202e-19 -2.168404e-19 -5.421011e-19 -9.757820e-19 0.000000e+00 2.070974e-03 2
2 -1.084202e-19 9.393308e-04 0.005272 9.702913e-03 0.000000 8.673617e-19 0.006552 0.000000 -7.047314e-19 8.946300e-04 ... 0.000000e+00 1.626303e-18 5.421011e-19 1.547008e-03 8.673617e-19 -1.301043e-18 1.409463e-18 -5.421011e-20 -7.589415e-19 3
3 5.963112e-19 1.301043e-18 0.007211 4.336809e-18 0.000000 1.883459e-03 0.002329 0.000000 -5.421011e-19 4.336809e-19 ... 2.253774e-03 2.650778e-03 1.467389e-03 3.252607e-19 1.938562e-03 2.315071e-03 3.092542e-03 1.129186e-03 -5.421011e-19 4

4 rows × 3584 columns

In [22]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot
In [24]:
pca = PCA(n_components=2)

result = pca.fit_transform(tfidf_matrix)
result = pd.DataFrame(result)
result.columns = ['X', 'Y']
result['cluster'] = df_muestra.cluster.values
result['texto'] = df_muestra.full_text.apply(lambda val: val[:140])


colorsIdx = {'1': 'blue',
             '2': 'yellow',
             '3': 'red',
             '4': 'green'
            }

cols = df_muestra['cluster'].map(colorsIdx)

trace = go.Scatter(x=result['X'].values,
                   y=result['Y'].values,
                   text=result['texto'].values,
                   mode='markers',
                   marker=dict(color=cols)) 

layout = go.Layout(title="PCA")

fig = go.Figure(data=trace, layout=layout)
iplot(fig)

Vemos para este caso que el K-means no es el mejor algoritmo para aplicar sobre estos datos porque están muy relacionados los clusters el uno con el otro, por esta razón es dificil ver claramente cuales son los cluster.